Setup

Start up - Folder structure

(Same as in the previous Rmd; no need to do it twice.)

Download datasets

  • ExerciceBisphenol.csv
  • ExercicePlasmaCholesterol.xlsx

And save them into your “data” folder.

Same .Rproject

Double check that you are still in the same Rproject (check on the top right of RStudio)

Load libraries

#load required packages
library(ggplot2) #to plot
library(readr) #to read files
library(here) #to find files


#if you want to make it reproducible (more advanced)
# if(!require(pacman)) {install.packages(c("pacman"))}
# 
# pacman::p_load(here,ggplot2, readr)


# these are just R Markdown visual options: hide messages and warnings in the
# knitted output, and print results without the default "##" comment prefix
knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA)

Find your path

#here() # from the here package: a neat magic trick that finds where your files are,
# given that you are in an .Rproject; type help(here) for more info

Read the data

# read the IL-18 data from the "data" folder of the project
# (alternatively, pass the full path to the file on your own machine, e.g.
#  il18 <- read_csv("C:/Users/david/Documents/cours_stat/freeIL18.csv"))
il18 <- read_csv(file = here("data", "freeIL18.csv"))
print(il18)
# A tibble: 73 x 4
   samples `Disease activity` `Free IL-18 (pg/ml)` `Total IL-18 (ng/ml)`
     <dbl> <chr>                             <dbl>                 <dbl>
 1       1 A                                  1.37                0.544 
 2       2 I                                  1.37                0.145 
 3       3 I                                  1.37                0.0893
 4       4 A                                  1.37                1.50  
 5       5 A                                  1.37                0.167 
 6       6 I                                  1.37                0.0478
 7       7 A                                 75.5                22.5   
 8       8 I                                  1.37                1.23  
 9       9 A                                187                  36.1   
10      10 I                                110                  30.0   
# ... with 63 more rows
# i Use `print(n = ...)` to see more rows
# replace the original headers by short names, in the same column order
colnames(il18) <- c("samples", "dis.act", "freeIL18", "TotalIL18")

Visualizations

Numerical

Density Plots

Show the density of a numeric variable

A density plot shows the distribution of a numeric variable. It takes only numeric variables as input and is very close to a histogram. It can be used in exactly the same conditions.

Tips:

  • Play with the bandwidth of your density function.

  • Don’t show the distribution of more than ~5 variables. Use Violin or Ridge line plot instead.

  • Avoid filling with color palettes.

Do a Basic density plot: x = freeIL18

# hint: ggplot(data, aes(...)) + geom_density()
# hint 2: store the plot in an object (here `density`), then print the
# object to draw it

density <- ggplot(il18, aes(x = freeIL18)) +
  geom_density(fill = "green")

density

Density plot by groups (dis.act)

# hint: map `fill` to the grouping variable to get one density per group
ggplot(il18, aes(x = freeIL18, fill = dis.act)) +
  geom_density()

Stacked density plot

# hint: same as before, but pass position = "fill" to geom_density()

ggplot(il18, aes(x = freeIL18, group = dis.act, fill = dis.act)) +
  geom_density(position = "fill")

Data Manipulation

Load libraries

#install.packages("tidyr") #  data manipulation
#install.packages("readxl") # read excel files
#install.packages("writexl") # write excel files
#install.packages("scales") # work with different scales
 #<- you just need to run these lines once


library(tidyr)
library(readxl)
library(writexl)
library(scales)

Try to read a .csv file: ExerciceBisphenol.csv

# hint: use read_csv() and store the result in df_bisphenol


# read the file from the "data" folder of the project
# (alternatively, pass the full path to the file on your own machine, e.g.
#  df_bisphenol <- read_csv("C:/Users/david/Documents/cours_stat/ExerciceBisphenol.csv"))
df_bisphenol <- read_csv(file = here("data", "ExerciceBisphenol.csv"))

print(df_bisphenol)
# A tibble: 3 x 3
  group               Cancer NoCancer
  <chr>                <dbl>    <dbl>
1 Control                  1       29
2 Moderate exposition      2       28
3 High exposition         10       20

Try to read an excel file

# hint: use read_xlsx() and store the result in plasma

# read the workbook from the "data" folder of the project
# (alternatively, pass the full path to the file on your own machine, e.g.
#  plasma <- read_xlsx("C:/Users/david/Documents/cours_stat/ExercicePlasmaCholesterol.xlsx"))
plasma <- read_xlsx(path = here("data", "ExercicePlasmaCholesterol.xlsx"))

print(plasma)
# A tibble: 20 x 3
   Control `Treatment 1` `Treatment 2`
     <dbl>         <dbl>         <dbl>
 1    164.          191.          164.
 2    173.          171.          153.
 3    178.          184.          154.
 4    170.          172.          152.
 5    183.          176.          158.
 6    172.          168           156.
 7    188.          182.          157 
 8    178.          160.          160.
 9    186.          173.          160.
10    176.          170.          148.
11    171.          177.          153.
12    177.          162.          158.
13    171.          176.          158.
14    182.          177.          156.
15    179.          165.          157.
16    187.          160.          154.
17    166           165.          147.
18    178.          173.          159.
19    167.          169.           NA 
20    172.          172.           NA 

Look at the data

Output the summary of the Bisphenol data

# hint: use summary() and/or str() to get an overview of each column
summary(df_bisphenol)
    group               Cancer          NoCancer    
 Length:3           Min.   : 1.000   Min.   :20.00  
 Class :character   1st Qu.: 1.500   1st Qu.:24.00  
 Mode  :character   Median : 2.000   Median :28.00  
                    Mean   : 4.333   Mean   :25.67  
                    3rd Qu.: 6.000   3rd Qu.:28.50  
                    Max.   :10.000   Max.   :29.00  

Same for the Plasma data

# hint: use summary() and/or str() to get an overview of each column
summary(plasma)
    Control       Treatment 1     Treatment 2   
 Min.   :164.4   Min.   :159.5   Min.   :147.4  
 1st Qu.:170.8   1st Qu.:167.2   1st Qu.:153.4  
 Median :176.3   Median :171.9   Median :156.8  
 Mean   :175.9   Mean   :172.0   Mean   :155.7  
 3rd Qu.:179.9   3rd Qu.:176.2   3rd Qu.:157.7  
 Max.   :187.9   Max.   :190.8   Max.   :163.6  
                                 NA's   :2      

Let’s look deeper into plasma

We will start with plasma. We have names with spaces, let’s rename them !

# rename the columns of plasma to "control", "treatment_1", "treatment_2"
# hint: assign a character vector, i.e. c("..", ".."), to names()

names(plasma) <- c(
  "control",
  "treatment_1",
  "treatment_2"
)
print(plasma)
# A tibble: 20 x 3
   control treatment_1 treatment_2
     <dbl>       <dbl>       <dbl>
 1    164.        191.        164.
 2    173.        171.        153.
 3    178.        184.        154.
 4    170.        172.        152.
 5    183.        176.        158.
 6    172.        168         156.
 7    188.        182.        157 
 8    178.        160.        160.
 9    186.        173.        160.
10    176.        170.        148.
11    171.        177.        153.
12    177.        162.        158.
13    171.        176.        158.
14    182.        177.        156.
15    179.        165.        157.
16    187.        160.        154.
17    166         165.        147.
18    178.        173.        159.
19    167.        169.         NA 
20    172.        172.         NA 

Wide to long format

Wide, or unstacked data is presented with each different data variable in a separate column.

Long, or narrow data is presented with one column containing all the values and another column listing the context of the values.

Many functions in R expect data to be in a long format rather than a wide format.

Here the data is in a wide format, let’s put it in long format so we can use all the power of most R functions.

For that we use the package tidyr

# stack the three columns on top of each other: `name` receives the former
# column name and `value` the corresponding measurement
plasma_long <- pivot_longer(
  plasma,
  cols = c("control", "treatment_1", "treatment_2"),
  names_to = "name",
  values_to = "value"
)
plasma_long
# A tibble: 60 x 2
   name        value
   <chr>       <dbl>
 1 control      164.
 2 treatment_1  191.
 3 treatment_2  164.
 4 control      173.
 5 treatment_1  171.
 6 treatment_2  153.
 7 control      178.
 8 treatment_1  184.
 9 treatment_2  154.
10 control      170.
# ... with 50 more rows
# i Use `print(n = ...)` to see more rows

Reflect: Do you see the differences ?

Now do a boxplot : x = name, y = value and add point

# hint: use geom_boxplot() + geom_jitter()
# hint 2: you might want to rename the columns?
# bonus: try to do the same plot with plasma (the one in wide format)


ggplot(plasma_long, aes(x = name, y = value)) +
  # the raw points are drawn below, so hide the boxplot's own outlier points
  # to avoid showing the same observation twice
  geom_boxplot(outlier.shape = NA) +
  # jitter horizontally only: vertical jitter would move the measured values
  geom_jitter(width = 0.2, height = 0)

Let’s look deeper into bisphenol

Contingency table of an experiment; let’s check whether the groups are of the same size.

# number of individuals per group: cancer cases plus non-cancer cases
df_bisphenol[["tot"]] <- df_bisphenol[["Cancer"]] + df_bisphenol[["NoCancer"]]
print(df_bisphenol)
# A tibble: 3 x 4
  group               Cancer NoCancer   tot
  <chr>                <dbl>    <dbl> <dbl>
1 Control                  1       29    30
2 Moderate exposition      2       28    30
3 High exposition         10       20    30

Let’s try a barplot with this wide format data:

First just plot the “cancer”

NB: we use geom_col

# one bar per group showing the number of cancer cases; the constant string
# given to `fill` becomes the legend label of this layer
wide_plot <- ggplot(data = df_bisphenol) +
  geom_col(mapping = aes(x = group, y = Cancer, fill = "cancer"))

wide_plot

Then add the “no cancer”

# add the non-cancer counts as a second, independent layer of bars
wide_plot +
  geom_col(mapping = aes(x = group, y = NoCancer, fill = "no cancer"))

#### Problem!

The second set of bars is drawn over the first one! The data needs to be in long format too. Can you do it?

# hint: use pivot_longer()

# gather the two count columns: `name` holds "Cancer"/"NoCancer" and
# `value` the number of individuals
df_bisphenol_long <- pivot_longer(
  df_bisphenol,
  cols = c("Cancer", "NoCancer")
)

# names(df_bisphenol_long) <- c("group","output","Nindividual")

# stacked bars (the default position of geom_col), flipped to be horizontal
ggplot(df_bisphenol_long, aes(x = group, y = value, fill = name)) +
  geom_col() +
  coord_flip()

Change from “stacked” plot to “dodged” plot

# hint: same plot as before, but pass position = "dodge" to geom_col()
ggplot(df_bisphenol_long, aes(x = group, y = value, fill = name)) +
  geom_col(position = "dodge") +
  coord_flip()

Add percents

# hint: same plot as in step one, but use position = "fill" in geom_col() and
# add scale_y_continuous(labels = percent_format(accuracy = 1)) at the end

ggplot(df_bisphenol_long, aes(x = group, y = value, fill = name)) +
  geom_col(position = "fill") +
  coord_flip() +
  scale_y_continuous(labels = percent_format(accuracy = 1))

More Visualisation Stuff

Use themes

Default
# violin plot of free IL-18 per disease-activity group, kept in an object
# so that different themes can be added to it afterwards
violin <- ggplot(il18, aes(x = dis.act, y = freeIL18, fill = dis.act)) +
  geom_violin() +
  labs(x = "Disease activity")

violin + theme_gray() # the default

Black and White
violin + theme_bw() # black and white theme

Classic
violin + theme_classic() # classic theme

Interactive plots

The package plotly helps create awesome interactive data visualisations and makes it super easy to do!

First load it

#install.packages("plotly") # install it if you don't have it (the package name must be quoted)

library(plotly)

Let’s redo the last scatter plot, but this time with a twist!

# scatter plot of total vs free IL-18, coloured by disease activity
# (a single geom_point() layer: the former second, empty geom_point() had
# neither data nor aesthetics and has been removed)
plot_object <- ggplot(
  il18,
  aes(x = freeIL18, y = TotalIL18, color = dis.act)
) +
  geom_point()

# turn the ggplot object into an interactive plotly figure
ggplotly(plot_object)